0. Loading in libraries

candidate_number <- 48626
set.seed(candidate_number)

1. Simulation exercise (Watches)

1.1 Data generation

# Number of observations
set.seed(candidate_number) # For reproducible results
n <- 2000  

## Categorical variables
# Generate Watch_Type
Watch_Type <- factor(sample(c("Luxury", "Mid-Range", "Budget"), n, replace = TRUE, prob = c(0.4, 0.3, 0.3)))

# Generate Case_Material
Case_Material <- factor(sample(c("Gold", "Titanium", "Stainless Steel", "Plastic"), n, replace = TRUE))

## Numeric variables
# Generate Feature_Count ranging from 0 to 6
Feature_Count <- sample(0:6, n, replace = TRUE)

# Generate Dimension_mm
Dimension_mm <- round(runif(n, 30, 50), 1)  # Random dimensions between 30mm and 50mm

# Generate Price_USD
Price_USD <- sapply(Watch_Type, function(type) {
  if (type == "Luxury") {
    round(runif(1, 5000, 80000), 2)
  } else if (type == "Mid-Range") {
    round(runif(1, 1000, 10000), 2) 
  } else {
    round(runif(1, 10, 2000), 2)
  }
})


# Create the dataframe with the predictors
watch_data <- data.frame(
  Watch_Type = Watch_Type,         # Watch type
  Case_Material = Case_Material,   # Case material
  Feature_Count = Feature_Count,   # Feature count (0 to 6)
  Dimension_mm = Dimension_mm,     # Diameter or width in mm
  Price_USD = Price_USD            # Price in USD
)

1.2 CEF

set.seed(candidate_number)
# Create CEF
CEF <- function(Watch_Type, Case_Material, Feature_Count, Dimension_mm, Price_USD) {
  # Base probability 
  base <- 0.3
  
  # Case Material effect
  material_effect <- ifelse(
    # stronger positive relationship for watches made of gold
    Case_Material == "Gold", 0.1,
    # weaker positive relationship for watches made of titanium
    ifelse(Case_Material == "Titanium", 0.05,
    # no relationship for watches made of stainless steel. Negative relationship for all other materials (e.g. plastic) 
           ifelse(Case_Material == "Stainless Steel", 0, -0.05))
  )
  
  # Feature Count effect
  feature_effect <- 0.02 * Feature_Count  # Each feature adds 0.02 to the probability
  
  # Dimension effect
  dimension_effect <- ifelse(
    Dimension_mm < 36, -0.05,    # Negative relationship for watches too small (<36mm)
    ifelse(Dimension_mm <= 42, 0.05, -0.02)  # Positive relationship for watches between 36-42mm, negative relationship for watches too big (>42)
  )
  
  # Because the price was generated based on the watch_type, the price effect will depend on       the watch_type. This is for feature interaction.
  
  price_effect <- ifelse(
    # Non-linear positive relationship for luxury watches and price
    Watch_Type == "Luxury", 0.001 * log(Price_USD + 1),
    # Weaker non-linear positive relationship for mid-range watches and price
    ifelse(Watch_Type == "Mid-Range", 0.0005 * log(Price_USD + 1),
           -0.0001 * Price_USD) # linear negative correlation for budget watch
  )
  
  # Add noise for randomness
  noise <- rnorm(length(Watch_Type), mean = 0, sd = 0.02)
  
  # Summation of all effects
  prob <- base + material_effect + feature_effect + dimension_effect + price_effect + noise
  
  # Probabilities must be in 0-1 range
  prob <- pmin(pmax(prob, 0), 1)
  
  return(prob)
}

# Add the CEF to data, store as Purchase_Probability
watch_data$Purchase_Probability <- with(
  watch_data,
  CEF(Watch_Type, Case_Material, Feature_Count, Dimension_mm, Price_USD)
)

2. Additive models

set.seed(candidate_number)
watch_data$Feature_Count <- as.factor(watch_data$Feature_Count) 
# due to low sparsity and the discrete nature of feature_count, it will be treated as a categorical

# Oracle GAM to capture interaction between Price_USD and Watch_Type
gam_oracle <- gam(
  Purchase_Probability ~ 
    s(Price_USD, k = 5) + 
    s(Dimension_mm, k = 5) +
    Feature_Count +
    Case_Material + 
    s(Price_USD, by = Watch_Type), # Interaction between Price_USD and Watch_Type
  family = quasibinomial, 
  data = watch_data
)
# k=5 for each numeric predictor to allow more flexibility with the interactions

# Simple GAM (no interactions)
gam_simple <- gam(
  Purchase_Probability ~ 
    s(Price_USD, k = 3) +
    s(Dimension_mm, k = 3) + 
    Watch_Type + 
    Case_Material + 
    Feature_Count,
  family = quasibinomial,
  data = watch_data
)
# k is reduced to 3 for numeric predictors to maintain simplicity for the simple model

# s() function (smooth function) was used for the 2 numerical variables (Price_USD and Dimension_mm) due to the CEF demonstrating non-linear, but smooth, relationships between the respective variables and the outcome.
# For price, the relationship for luxury and mid-range watches are modelled logarithmically
# For dimension, probability decreases from too low or too high measurements

# Family = quasibinomial because the outcome is a probability, a continuous variable lying between 0 and 1. 
# the quasibinomial family accounts for probabilities between 0 and 1, whereas binomial relies on binary outcomes (0 and 1)
# Gaussian probabilities accounts for continuous variables but is not unbounded, so quasibinomial keeps the outcome constrained 0-1.

summary(gam_oracle)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## Purchase_Probability ~ s(Price_USD, k = 5) + s(Dimension_mm, 
##     k = 5) + Feature_Count + Case_Material + s(Price_USD, by = Watch_Type)
## 
## Parametric coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.444637   0.012158 -36.573   <2e-16 ***
## Feature_Count1                0.101731   0.011252   9.042   <2e-16 ***
## Feature_Count2                0.193909   0.011091  17.484   <2e-16 ***
## Feature_Count3                0.316465   0.010982  28.816   <2e-16 ***
## Feature_Count4                0.384284   0.011354  33.846   <2e-16 ***
## Feature_Count5                0.466387   0.010998  42.407   <2e-16 ***
## Feature_Count6                0.555194   0.010592  52.416   <2e-16 ***
## Case_MaterialPlastic         -0.681913   0.008539 -79.855   <2e-16 ***
## Case_MaterialStainless Steel -0.436293   0.008165 -53.435   <2e-16 ***
## Case_MaterialTitanium        -0.209383   0.008145 -25.707   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                     edf Ref.df        F p-value    
## s(Price_USD)                     1.0007  1.001    0.216   0.642    
## s(Dimension_mm)                  3.9968  4.000  714.166  <2e-16 ***
## s(Price_USD):Watch_TypeBudget    2.7667  2.946 1189.306  <2e-16 ***
## s(Price_USD):Watch_TypeLuxury    0.9034  1.372    0.627   0.387    
## s(Price_USD):Watch_TypeMid-Range 1.0005  1.001    0.093   0.762    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 44/45
## R-sq.(adj) =  0.922   Deviance explained = 91.7%
## GCV = 0.0039351  Scale est. = 0.0038122  n = 2000

For the gam_oracle, all predictors, apart from the price variable and the interactions between price and luxury watch types are statistically insignificant at the 5% level.

A very well-fitted model, with high R-sq. value at 0.922, high deviance explained at 91.7%. There could be an issue of over-fitting however.

Interaction terms for Luxury and Mid-Range watch types are not significant, but budget watches are. This reflects the logarithmic interactions Luxury and Mid-Range watch types have with the price. For example, going from $5,000 to $10,000 may have a stronger effect on purchase probability than going from $50,000 to $55,000 for the same Watch_Type (i.e. Luxury).

summary(gam_simple)
## 
## Family: quasibinomial 
## Link function: logit 
## 
## Formula:
## Purchase_Probability ~ s(Price_USD, k = 3) + s(Dimension_mm, 
##     k = 3) + Watch_Type + Case_Material + Feature_Count
## 
## Parametric coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.90491    0.01919 -47.161  < 2e-16 ***
## Watch_TypeLuxury              0.49271    0.01911  25.784  < 2e-16 ***
## Watch_TypeMid-Range           0.45787    0.01333  34.362  < 2e-16 ***
## Case_MaterialPlastic         -0.68262    0.01441 -47.387  < 2e-16 ***
## Case_MaterialStainless Steel -0.43619    0.01376 -31.692  < 2e-16 ***
## Case_MaterialTitanium        -0.20520    0.01374 -14.938  < 2e-16 ***
## Feature_Count1                0.09204    0.01900   4.843 1.38e-06 ***
## Feature_Count2                0.17832    0.01871   9.529  < 2e-16 ***
## Feature_Count3                0.30345    0.01852  16.384  < 2e-16 ***
## Feature_Count4                0.38064    0.01914  19.887  < 2e-16 ***
## Feature_Count5                0.43622    0.01853  23.542  < 2e-16 ***
## Feature_Count6                0.54311    0.01787  30.391  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df       F p-value    
## s(Price_USD)    1.000      1   0.376    0.54    
## s(Dimension_mm) 1.998      2 297.199  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.778   Deviance explained =   76%
## GCV = 0.011367  Scale est. = 0.010941  n = 2000

The R^2 value and deviance explained values are lower than the very high values from the gam_oracle model.
This suggests the failure to capture the complexities from the data and the CEF (with the interaction between Watch_Type and Price_USD) in the gam_simple model reduced the performance quality.
However, due to the rest of the data and CEF being simplistic, the model’s performance remains effective, explaining the majority of the variance.

s(Price_USD) having high p-values in both models suggests price alone does not have a significant non-linear effect on purchase probability.

The simple GAM, as expected, performs worse than the oracle GAM. This is because the simple GAM failed to capture the interaction between the price and watch type, which was included in both the data simulation and the CEF.
However, the simple GAM remains a well-fitted model, due to the simplicity of the CEF, where all other variables had no interaction and the GAM managed to capture the complex relationships of the numeric variables (such as the dimension)

Diagnostic plots

# Diagnostic plots for both GAMs
par(mfrow = c(2, 2))
gam.check(gam_oracle)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 13 iterations.
## Gradient range [-6.918234e-10,2.122863e-09]
## (score 0.003935064 & scale 0.003812158).
## Hessian positive definite, eigenvalue range [6.00479e-10,1.859733e-06].
## Model rank =  44 / 45 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                                     k'   edf k-index p-value    
## s(Price_USD)                     4.000 1.001    1.00    0.57    
## s(Dimension_mm)                  4.000 3.997    0.63  <2e-16 ***
## s(Price_USD):Watch_TypeBudget    9.000 2.767    1.00    0.53    
## s(Price_USD):Watch_TypeLuxury    9.000 0.903    1.00    0.53    
## s(Price_USD):Watch_TypeMid-Range 9.000 1.000    1.00    0.48    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gam.check(gam_simple)

## 
## Method: GCV   Optimizer: outer newton
## full convergence after 9 iterations.
## Gradient range [-1.821158e-10,1.014147e-08]
## (score 0.01136748 & scale 0.01094072).
## Hessian positive definite, eigenvalue range [1.821094e-10,4.802602e-08].
## Model rank =  16 / 16 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                 k' edf k-index p-value    
## s(Price_USD)     2   1    0.55  <2e-16 ***
## s(Dimension_mm)  2   2    0.70  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The gam_oracle residuals vs. linear predictor plot show a and tighter spread across the linear predictor axis, with less obvious patterns, shapes or trends. I.e. the results show residuals are homoscedastic.
Most of the residuals are close to 0

In gam_simple the residuals also appear random and fairly evenly scattered around 0, showing homoscedasticity. However, the spread appears larger in some parts, such as lower linear predictors.

Homoscedasticity is desirable in residual vs linear plots, because it means residuals are equally spread across all levels of predictor. If residual spread increases or decreases in certain regions, predictions in said-regions may be less reliable.

In the histogram of residuals plot, for gam_oracle, the residuals appear tighter around 0 with less spread along the x-axis (Residuals), compared to gam_simple, which has a slightly broader distribution (the magnitude of residuals can be as high as 0.4) and a higher frequency of residuals away from 0.

Response vs. Fitted Values plot in gam_oracle align much closer and tigher to a diagonal than for gam_simple, directly proportional line, suggesting more accuracy, less error and better prediction.

Normal Q-Q plot appears more diagonally in gam_oracle, suggesting deviance residuals approximate a normal distribution better than gam_simple.

# Plot predictions vs actual for both GAMs with Watch_Type key
pred_actual_oracle <- ggplot(watch_data, aes(x = predict(gam_oracle, newdata = watch_data, type = "response"), y = Purchase_Probability, color = Watch_Type)) +
  # type="response" to make predictions on the same scale as the actual values
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color="black") +
  labs(title = "Predictions vs Actual (GAM Oracle)",
       x = "Predicted Purchase Probability",
       y = "Actual Purchase Probability",
       color = "Watch Type") +
  theme_minimal()

pred_actual_simple <- ggplot(watch_data, aes(x = predict(gam_simple, newdata = watch_data, type = "response"), y = Purchase_Probability, color = Watch_Type)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color="black") +
  labs(title = "Predictions vs Actual (GAM Simple)",
       x = "Predicted Purchase Probability",
       y = "Actual Purchase Probability",
       color = "Watch Type") +
  theme_minimal()
grid.arrange(pred_actual_oracle, pred_actual_simple, ncol = 2)

Both plots show a strong diagonal line trend, where predicted probabilities align closely with actual purchase probabilities. This indicates both the GAM Oracle and GAM Simple models are performing reasonably well in terms of capturing the overall trends in the data.

The tight clustering along the diagonal suggests a high level of agreement between predictions and actual values, particularly for mid-range probabilities (0.2 to 0.5).

In the Oracle GAM plot, predictions appear tighter around the diagonal line trend, indicating that the Oracle model is performing better than the Simple model. This is consistent with the higher R-squared (adjusted) and deviance explained in the Oracle model summary compared to the Simple model.

In the Simple GAM plot, there is more spread in the predictions, particularly for budget watches at low actual probabilities.

This could indicate that the Simple model is less effective at capturing the complexities (e.g. interactions between Watch_Type and Price_USD) from the data and CEF, where budget watches deviated the furthest from the black line, resulting from the lack of interaction in the model. Including interaction terms provided better predictive performance.

PDPs (Partial-dependence Plots)

Partial-dependence plots (PDPs) show the marginal effect of one or more predictors on the predicted outcome of a model.

Price_USD was chosen for the PDPs because of its complex non-linear relationship with the outcome, where the relationship was logarithmic for luxury and mid-range watch types.

Dimension_mm was also chosen because of its non-linear relationship with the outcome, where watches too big or small were less desirable than the ideal mid-range (36-42mm).

Other variables were either categorical, which makes PDP plots less useful, and Feature_Count was modeled as a simple linear effect in the CEF (more features, higher probability), which makes Feature_Count less visually informative.

# PDP and ICE for Price_USD in gam_simple
pdp_price_simple <- partial(
  object = gam_simple,
  pred.var = "Price_USD",
  newdata = watch_data, 
  pred.fun = function(object, newdata) predict(object, newdata, type = "response")
) # type="response" to make predictions on the same scale as the actual values

# Plot PDP
plot_price_simple <- autoplot(pdp_price_simple) +
  ggtitle("PDP: Price_USD (GAM Simple)") +
  xlab("Price (USD)") +
  ylab("Purchase Probability") +
  theme_minimal()

# PDP and ICE for Price_USD in gam_oracle
pdp_price_oracle <- partial(
  object = gam_oracle,
  pred.var = "Price_USD",
  newdata = watch_data,
  pred.fun = function(object, newdata) predict(object, newdata, type="response")
)

# Plot PDP
plot_price_oracle <- autoplot(pdp_price_oracle) +
  ggtitle("PDP: Price_USD (GAM Oracle)") +
  xlab("Price (USD)") +
  ylab("Purchase Probability") +
  theme_minimal()

grid.arrange(plot_price_simple, plot_price_oracle, ncol = 2, nrow=1)

The red line shows the PDP (partial-dependence plot), which represents the average effect of Price_USD on Purchase_Probability across all prices.

In the simple GAM, the PDP line (as well as the black ICE lines) is flat, indicating that Price_USD has little to no significant average effect on Purchase_Probability.
This suggests that, without interactions, the model does not capture a meaningful relationship between price and purchase probability across watch types, as reflected in summary(gam_simple).

PDP/ICE plot for gam_oracle captures non-linear interactions between price and Watch_Type, leading to more compelx patterns:

  • For Budget watches (low prices), increasing Price_USD decreases the purchase probability initially, reflecting price sensitivity from the CEF with budget watches.

  • Around $5,000, the effect flattens, indicating a weak relationship between price and purchase probability for Mid-Range watches. This reflects diminishing sensitivity to price increases in Mid-Range watches.

  • At approximately $40,000, there is a sudden increase in purchase probability, followed by a stabilisation, which captures the logarithmic interaction of Price_USD with Luxury watches from the CEF.

PDP for gam_oracle provides a richer interpretation by incorporating interactions, showing that the effect of price on purchase probability depends on the watch type. In contrast, gam_simple fails to capture these relationships, oversimplifying the role of Price_USD.

The ICE in gam_oracle displays heterogeneity in the predictor’s effect, aligning with the CEF in the data. As ICE (individual conditional expectation) displays the effect of a predictor (in this case, Price_USD) on the predicted outcome for each observation in the dataset, some ICE lines are completely flat, others follow the trend of the PDP (which is the average of all ICE lines). Price’s effect on Purchase_Probability depending on Watch_Type, each type having differing, yet overlapping price ranges.

set.seed(candidate_number)
# PDP and ICE for Dimension_mm in gam_simple
pdp_dimension_simple <- partial(
  object = gam_simple,
  pred.var = "Dimension_mm",
  newdata = watch_data,
  pred.fun = function(object, newdata) predict(object, newdata, type = "response")
)

# Plot PDP/ICE gam_simple
plot_dimension_simple <- autoplot(pdp_dimension_simple) +
  ggtitle("PDP: Dimension_mm (GAM Simple)") +
  xlab("Dimension (mm)") +
  ylab("Purchase Probability") +
  theme_minimal()

# PDP and ICE for Dimension_mm in gam_oracle
pdp_dimension_oracle <- partial(
  object = gam_oracle,
  pred.var = "Dimension_mm",
  newdata = watch_data,  # Use watch_data for predictions
  pred.fun = function(object, newdata) predict(object, newdata, type = "response")
)

# Plot PDP/ICE gam_simple
plot_dimension_oracle <- autoplot(pdp_dimension_oracle) +
  ggtitle("PDP: Dimension_mm (GAM Oracle›)") +
  xlab("Dimension (mm)") +
  ylab("Purchase Probability") +
  theme_minimal()

grid.arrange(plot_dimension_simple, plot_dimension_oracle, ncol = 2, nrow=1)

The gam_simple PDP shows a concave curve between dimension and the marginal effects it has on the Purchase_Probability. Probability peaks at roughly 40mm, reflecting the optimal range of watch dimensions being between 36-42mm from the CEF and that watches below or above that range are less likely to be purchased.

The PDP curve for gam_oracle is more complex:

  • The PDP begins convex, showing a gradual increase in purchase probability as Dimension_mm increases from its lower range.

  • Between 35 mm and 38 mm, the rate of increase slows down, marking the first point of inflection.

  • The PDP reaches its maximum value at approximately 40 mm, reflecting the optimal dimension range (36-42mm) outlined in the CEF

  • Beyond the peak, the PDP shows an initial gradual decline in purchase probability, indicating that larger dimensions start to reduce the likelihood of purchase.

  • Around 42 mm to 44 mm, the rate of decline slows down, representing a second point of inflection where the negative impact begins to level off.

After reaching a minimum point between 45 mm and 47 mm, the PDP begins to rise again, which is not reflective from the CEF, showing that any dimension values beyond 42mm should decrease the purchase probability.

The ICE lines are all parallel and very closely aligned across all observations, suggesting homogeneity in both the simple and oracle GAMs, where each ICE line follows the same pattern.

3. Tree based models

set.seed(candidate_number)
# 10-fold cross-validation (for all tree models)
watch_cv <- vfold_cv(watch_data, v = 10)

# Create the recipe (for all tree models)
watch_recipe <- recipe(Purchase_Probability ~ ., data = watch_data) %>%
  step_dummy(all_nominal_predictors())
# Encode the categorical (nominal) predictors with step_dummy() to ensure each category is treated as independent and prevent any assumptions about the "ordering" of categories

3.1 Simple tree model

# Simple tree model
set.seed(candidate_number)
watch_tree <- decision_tree(tree_depth = 6, cost_complexity = tune("C")) %>%
  set_engine("rpart") %>%
  set_mode("regression")

# Simple tree workflow
watch_workflow_tree <- workflow() %>%
  add_recipe(watch_recipe) %>%
  add_model(watch_tree)

# Tune the simple tree model
watch_fit_tree <- tune_grid(
  watch_workflow_tree,
  resamples = watch_cv,
  grid = data.frame(C = 2^(-15:0)),
  metrics = metric_set(rmse, rsq, mae),
)
# Visualise cost-complexity vs metrics
# lower cost-complexity will improve the performance (lower mae and rmse, higher rsq)
autoplot(watch_fit_tree)

A higher cost-complexity means a large penalty is applied to the tree’s complexity (no. splits or terminal nodes). The model is encouraged to prune the tree aggressively, resulting in smaller, simpler trees.

There is a discontinuity in the rsq, which arises likely due to the oversimplification of the model leading to constant predictions for all observations.
The standard deviation of the predicted values becomes 0, which means R^2 is NA, because of a divide by 0.

select_best(watch_fit_tree, metric = "rmse")
## # A tibble: 1 × 2
##           C .config              
##       <dbl> <chr>                
## 1 0.0000305 Preprocessor1_Model01

A cost-complexity value of 3.051758e is the hyperparamter for optimal performance in the simple tree.

set.seed(candidate_number)
# Select the best hyperparameters based on RMSE
best_tree_params <- select_best(watch_fit_tree, metric = "rmse")

# Finalize the workflow with the best parameters
final_watch_workflow_tree <- finalize_workflow(watch_workflow_tree, best_tree_params)

# Fit the finalized workflow on the full dataset
final_watch_fit_tree <- fit(final_watch_workflow_tree, data = watch_data)

# Predict on the full dataset
tree_predictions <- predict(final_watch_fit_tree, new_data = watch_data) %>%
  bind_cols(watch_data)

ggplot(tree_predictions, aes(x = .pred, y = Purchase_Probability)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + # line of perfect prediction (y=x)
  labs(
    title = "Predicted vs Actual - Simple Decision Tree",
    x = "Predicted Purchase Probability",
    y = "Actual Purchase Probability"
  ) +
  theme_minimal()

Visibly, there’s a clear diagonal relationship between the predicted and actual values, however many points deviate quite far from the red line (the line of perfect prediction), suggesting instability and inaccuracy in prediction.

The points cluster vertically at certain predicted values, suggesting an oversimplified model being unable to capture complex relationships between the predictors and outcome. It shows that many observations in the data having the same predicted value, even when their actual values differ (a step-like prediction).

We do not know, however, which specific predictors are driving the issues of vertical clustering

# Residual plot
ggplot(tree_predictions, aes(x = Purchase_Probability, y = Purchase_Probability - .pred)) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Residual Plot - Simple Decision Tree",
    x = "True Purchase Probability",
    y = "Residuals"
  ) +
  theme_minimal()

The residuals aren’t randomly scattered around the horizontal red line (of perfect prediction). A clear diagonal structure is displayed, where the spread of residuals increases as the probability increases.
In addition, the step-like prediction is further reinforced by the many separate diagonal trends throughout the plot.

This is a display of heteroscedasticity, where the model doesn’t predict equally well across all target ranges.

However, this plot doesn’t provide insight on which features may be causing higher residuals and an undersirable heteroscedasticity relationship. We don’t know whether each variable contributed equally to the residuals or if one contributed disproportionately to the others

# 3 metrics for simple tree
metrics_tree <- metrics(
  data = tree_predictions,
  truth = Purchase_Probability,
  estimate = .pred
)

print(metrics_tree)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0441
## 2 rsq     standard      0.808 
## 3 mae     standard      0.0349

The metrics show a relatively strong predictive performance for the simple tree, with a low rmse and mae value and a high rsq values.

# Feature importance plot - simple decision tree
set.seed(candidate_number)
# Extract tree model fit for variable importance
tree_model <- extract_fit_parsnip(final_watch_fit_tree)$fit
importance_tree <- tree_model$variable.importance

# Convert to a data frame for ordering
importance_tree_df <- as.data.frame(importance_tree) %>%
  rownames_to_column(var = "Variable") %>%
  rename(Importance = importance_tree)

# Re-order by importance (descending)
importance_tree_df <- importance_tree_df %>%
  arrange(desc(Importance))

ggplot(importance_tree_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Feature Importance - Simple Decision Tree", x = "Feature", y = "Importance") +
  theme_minimal()

Price_USD was the most important feature, followed closely by plastic case materials and then dimensions. However, it does not show how the features (such as price) affects the outcome.
It is important to note that feature importance plots may be biased towards numeric variables or categorical features with more categories, as they offer more potential splits.

More importantly, the feature plot did not capture the interaction of watch_types and Price_USD from the CEF. The plot is perhaps misleading, where Price_USD was considered an insignificant predictor in both GAM models, whereas it was considered the most important feature here.

Limitations

The simple tree provides interpretability and simplicity, but they tend to underfit data, oversimplify patterns and losing detail (variations within regions), by lacking the flexibility to.

Since the CEF involved an interaction between Price_USD and Watch_Type, the simple tree may have been too simplistic and may have struggle to capture the more complex relationships.
Resulting in higher error terms (higher RMSE and MAE) and low variance explained (low rsq), meaning low accuracy and greater, heteroscedastic residuals.

3.2 Random forest decision-tree

set.seed(candidate_number)
watch_rf <-
rand_forest(trees = 100, mtry = tune()) %>%
set_mode("regression") %>%
set_engine("randomForest")

# add watch recipe to RF workflow
watch_workflow_rf <- workflow() %>%
add_recipe(watch_recipe) %>%
add_model(watch_rf)

watch_fit_rf <- tune_grid(
watch_workflow_rf,
watch_cv,
grid = expand.grid(mtry = 1:5), # Only having 5 predictors, same as the dataset
metrics = metric_set(rmse, rsq, mae)
)

autoplot(watch_fit_rf)

5 randomly selected predictors was the optimal number for the randomForest model, the same as the number of predictors in the dataset.

set.seed(candidate_number)
# Select best hyperparameters (i.e. 5 randomly selected predictors)
best_rf_params <- select_best(watch_fit_rf, metric = "rmse")

# Finalize the workflow with the best parameters
final_workflow_rf <- finalize_workflow(watch_workflow_rf, best_rf_params)

# Fit the finalized workflow on the full dataset
final_fit_rf <- fit(final_workflow_rf, data = watch_data)

# Step 4: Predict on the full dataset
rf_predictions <- predict(final_fit_rf, new_data = watch_data) %>%
  bind_cols(watch_data)

ggplot(rf_predictions, aes(x = .pred, y = Purchase_Probability)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Predicted vs Actual - Random Forest Decision Tree",
    x = "Predicted Purchase Probability",
    y = "Actual Purchase Probability"
  ) +
  theme_minimal()

Compared to the simple tree model, the points are scattered much more closely and tighter around the red line of perfect prediction. This suggests a better fitted model.

ggplot(rf_predictions, aes(x = Purchase_Probability, y = Purchase_Probability - .pred)) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Residual Plot - RandomForest Decision Tree",
    x = "Actual Purchase Probability",
    y = "Residuals"
  ) +
  theme_minimal()

The residuals plot shows the model performs reasonably well, better than the simple tree. The residuals are much closer to the line of perfect prediction, and only deviate as far as ~0.1 from the line.
A lack of clear pattern suggests the model captures the complex relationships in the data effectively without severe bias.
However, due to increasing spread at higher purchase probabilities, it suggests a slight violation of homoscedasticity, as residuals start to skew positively.

metrics_rf <- metrics(
  data = rf_predictions,
  truth = Purchase_Probability,
  estimate = .pred
)

print(metrics_rf)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0206
## 2 rsq     standard      0.965 
## 3 mae     standard      0.0161

Metrics suggest a very well-fitted model, with high R^2 and very low RMSE and MAE values. Much better than the simple tree model. Shows that the RF model was able to capture complex relationships within the data.

# Variable importance plot
set.seed(candidate_number)
rf_model <- extract_fit_parsnip(final_fit_rf)$fit
varImpPlot(rf_model, main = "Variable Importance - Random Forest")

Similar ranking of feature importance to the simple tree, but giving more importance to higher watch feature counts and mid-range watch types.

Limitations of feature importance plots outlined in the simple tree feature plot persist.

Limitations

Since randomForest is a bagging method, it is more computationally costly than the simple tree model, as well as more complex and less interpretable.

Tuning hyperparameters is crucial to prevent both overfitting and underfitting.

3.3 Boosted decision-tree

# Boosted tree (high loading time, 1-2 mins)
set.seed(candidate_number) # For reproducibility

watch_boost <-
boost_tree(trees = tune(),
learn_rate = tune()) %>% # tuning parameters
set_mode("regression") %>%
set_engine("xgboost")

watch_workflow_boost <- workflow() %>%
add_recipe(watch_recipe) %>%
add_model(watch_boost)

watch_fit_boost <- tune_grid(
watch_workflow_boost, 
  resamples = watch_cv,
  metrics = metric_set(rmse, rsq, mae),
)
# what number of trees and learning rate yield optimal performance
select_best(watch_fit_boost, metric = "rmse")
## # A tibble: 1 × 3
##   trees learn_rate .config              
##   <int>      <dbl> <chr>                
## 1   723     0.0187 Preprocessor1_Model06
# autoplot to show model performance at different learning rates and number of trees
autoplot(watch_fit_boost)

The autoplots show a moderate number of trees (at 723) and a moderate learning rate (0.0186521) yield the optimal performance.

# Select best parameters for boosting
best_boost_params <- select_best(watch_fit_boost, metric = "rmse")

# Finalize the workflow with best parameters
final_workflow_boost <- finalize_workflow(watch_workflow_boost, best_boost_params)

# Fit finalized workflow on the full dataset
final_fit_boost <- fit(final_workflow_boost, data = watch_data)

# Step 4: Predict on the full dataset
boost_predictions <- predict(final_fit_boost, new_data = watch_data) %>%
  bind_cols(watch_data)

ggplot(boost_predictions, aes(x = Purchase_Probability, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "True vs Predicted - Random Forest Decision Tree",
    x = "True Purchase Probability",
    y = "Predicted Purchase Probability"
  ) +
  theme_minimal()

The points lie much tighter and more closely around the red line than the simple tree and randomForest models, suggesting a very well-fit model to the training set, not far from perfect.

ggplot(boost_predictions, aes(x = Purchase_Probability, y = Purchase_Probability - .pred)) +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") + # Line of perfect prediction (y=0)
  labs(
    title = "Residual Plot - Simple Decision Tree",
    x = "True Purchase Probability",
    y = "Residuals"
  ) +
  theme_minimal()

Residual points lie very closely to 0, only going as far as just over 0.05, where points in RF model would lie as far as 0.1, even further for the simple tree model. No obvious pattern, indicating homoscedasticity

metrics_boost <- metrics(
  data = boost_predictions,
  truth = Purchase_Probability,
  estimate = .pred
)

print(metrics_boost)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard     0.0121 
## 2 rsq     standard     0.986  
## 3 mae     standard     0.00932

The metrics for the boosted model shows that it is the best performing model out of the 3. The lowest RMSE and MAE, and the highest R^2 (nearly 1).

xgb_model <- extract_fit_parsnip(final_fit_boost)$fit
importance_xgb <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix = importance_xgb, rel_to_first = TRUE, xlab = "Relative Importance")

Price had the highest variable importance, followed by plastic case materials and dimension, yet again.

Watch types were given much less importance than the other 2 models.

The limitations of the feature importance plots outlined in the simple tree still persist, such as not being able to capture interactive effects (especially given the diminished importance of watch types) and bias towards numerical variables.

Limitations

Boosted trees are computationally expensive, taking over a minute to run, more than simple trees and even randomForest, due to their iterative nature.

Boosted trees are more complex and less interpretable than the other 2 models.

They’re less prone to underfitting the data but are more prone to overfitting if hyperparamters are not well-tuned.

4. ID generalisation (unseen data, same distribution)

To determine which tree model has the best test accuracy, it must have:

set.seed(candidate_number)  # Ensure reproducibility

watch_data$Feature_Count <- as.factor(watch_data$Feature_Count)
watch_split <- initial_split(watch_data, prop = 0.7) # Used 70% of data for training, because the data is large (2000 entries) and the CEF is not very complex
id_train <- training(watch_split)
id_test <- testing(watch_split)

4.1 Random Forest

# Update the recipe, change data to training data than entire dataset
id_train_recipe <- recipe(Purchase_Probability ~ ., data = id_train) %>%
  step_dummy(all_nominal_predictors())

# add model and id recipe to RF workflow
id_workflow_rf <- workflow() %>%
  add_recipe(id_train_recipe) %>%
  add_model(watch_rf)

tuned_rf <- tune_grid(
  id_workflow_rf,
  resamples = vfold_cv(id_train, v = 10),
  grid = expand.grid(mtry = 1:5), # Replace with appropriate grid
  metrics = metric_set(rmse, rsq)
)

# select best number of randomly selected predictors
best_rf_params <- select_best(tuned_rf, metric = "rmse")

# finalize workflow
final_id_workflow_rf <- finalize_workflow(id_workflow_rf, best_rf_params)

# fit finalized model on the ID training data
fit_id_rf_model <- fit(final_id_workflow_rf, data = id_train)

# predict on the ID test set
id_predictions_rf <- predict(fit_id_rf_model, new_data = id_test) %>%
  bind_cols(id_test) %>%
  metrics(truth = Purchase_Probability, estimate = .pred)

id_predictions_rf
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0339
## 2 rsq     standard      0.896 
## 3 mae     standard      0.0262

4.2 Simple tree

# Update new recipe onto ID workflow and add simple tree model
id_workflow_tree <- workflow() %>%
  add_recipe(id_train_recipe) %>% # Updated rcipe
  add_model(watch_tree)

set.seed(candidate_number)
tuned_tree <- tune_grid(
  id_workflow_tree,
  resamples = watch_cv,  # Cross-validation folds
  grid = data.frame(C = 2^(-15:0)),  # Grid of Cost-complexity values. Allow for low C values (higher complexity)
  metrics = metric_set(rmse, rsq, mae)
)

# Select the best Cost-complexity value
best_tree_params <- select_best(tuned_tree, metric = "rmse") # Same C value will likely correspond for highest R^2 too

# Finalize the workflow with best hyperparameters
final_tree_workflow <- finalize_workflow(
  watch_workflow_tree,
  best_tree_params
)

# Fit model to training data
final_tree_model <- fit(final_tree_workflow, data = id_train)

# Predict on the test data
id_predictions_tree <- predict(final_tree_model, id_test) %>%
  bind_cols(id_test)

# Calculate metrics
metrics_tree_id <- metrics(
  data = id_predictions_tree,
  truth = Purchase_Probability,
  estimate = .pred
)

print(metrics_tree_id)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0502
## 2 rsq     standard      0.746 
## 3 mae     standard      0.0386

4.3 Boosted tree

set.seed(candidate_number)

id_workflow_boost <- workflow() %>%
  add_recipe(id_train_recipe) %>%  # Updated recipe
  add_model(watch_boost)

# Tune hyperparameters to ensure generalisation
boost_tuning_results <- tune_grid(
  id_workflow_boost,
  resamples = watch_cv,  # 10 cross-validation folds
  grid = expand.grid(
    trees = c(50, 100, 150),
    learn_rate = c(0.01, 0.05, 0.1)
  ),
  metrics = metric_set(rmse, rsq, mae)
)

# Select best hyperparameters
best_boost_params_id <- select_best(boost_tuning_results, metric = "rmse")

# Finalize workflow with best parameters
final_boost_workflow_id <- finalize_workflow(
  id_workflow_boost,
  best_boost_params
)

# Fit finalized model to training data
final_boost_model_id <- fit(final_boost_workflow_id, data = id_train)

# Predict on test data
id_predictions_boost <- predict(final_boost_model_id, id_test) %>%
  bind_cols(id_test)

# Extract metrics
metrics_id_boost <- metrics(
  data = id_predictions_boost,
  truth = Purchase_Probability,
  estimate = .pred
)

print(metrics_id_boost)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0275
## 2 rsq     standard      0.924 
## 3 mae     standard      0.0207

4.4 ID Table Comparison

RMSE R^2 MAE
Random forest 0.0339 0.896 0.0262
Simple tree 0.0502 0.746 0.0386
Boosted tree 0.0275 0.924 0.0207

Judging from the metrics, the boosted tree is the superior tree model out of the 3. With the lowest rmse, and mae and the highest rsq values.

5. OOD generalisation (unseen data, different distribution)

# Plot boosted decision tree from ID generalisation
xgb_model <- extract_fit_parsnip(final_boost_model_id)$fit
importance_xgb <- xgb.importance(model = xgb_model)
xgb.plot.importance(importance_matrix = importance_xgb, rel_to_first = TRUE, xlab = "Relative Importance")

To identify which features contributed most to the predictions in the ID model, I did a feature importance plot of the ID model. It allowed me to target these features when designing the OOD shifts, and ensured the shifts were small but meaningful.

5.1 Concept shift

5.1.1 Worse performance: linear price-type interaction effect

set.seed(candidate_number)

CEF_worse <- function(Watch_Type, Case_Material, Feature_Count, Dimension_mm, Price_USD) {
  base <- 0.3
  
  material_effect <- ifelse(
    Case_Material == "Gold", 0.1,
    ifelse(Case_Material == "Titanium", 0.05,
           ifelse(Case_Material == "Stainless Steel", 0, -0.05))
  )
  
  Feature_Count <- as.numeric(Feature_Count)
  feature_effect <- 0.02 * Feature_Count
  
  dimension_effect <- ifelse(
    Dimension_mm < 36, -0.05,
    ifelse(Dimension_mm <= 42, 0.05, -0.02)
  )
  
  # Linear price effect instead of log on luxury and budget watches from the original CEF
  price_effect <- ifelse(
    Watch_Type == "Luxury", 0.00001 * Price_USD,
    ifelse(Watch_Type == "Mid-Range", 0.000005 * Price_USD,
           -0.0001 * Price_USD)
  )
  
  noise <- rnorm(length(Watch_Type), mean = 0, sd = 0.02)
  
  prob <- base + material_effect + feature_effect + dimension_effect +
    price_effect + noise
  
  prob <- pmin(pmax(prob, 0), 1)
  return(prob)
}

watch_con_worse <- watch_data
watch_con_worse$Purchase_Probability <- with(
  watch_con_worse,
  CEF_worse(Watch_Type, Case_Material, Feature_Count, Dimension_mm, Price_USD)
)
watch_con_worse$Feature_Count <- as.factor(watch_con_worse$Feature_Count)
# Predict the ID-trained model on worse OOD data
con_predictions_worse <- predict(final_boost_model_id, new_data = watch_con_worse) %>%
  bind_cols(watch_con_worse)

# Extract metrics
metrics_con_worse <- metrics(
  data = con_predictions_worse,
  truth = Purchase_Probability,
  estimate = .pred
)
print(metrics_con_worse)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.281
## 2 rsq     standard       0.326
## 3 mae     standard       0.182

I hypothesised that changing the price effect in the CEF, where the original CEF had a logarithmic relationship between the price and the outcome (purchase probability), to a linear relationship would make the model perform worse.

Price had the highest feature importance score from the boosted tree model, therefore awkward changes to the price effect, especially when it doesn’t align with the original CEF, would deteriorate the model performance.

Simplifying the price relationship in this way meant the shift couldn’t capture the more complex relationship between Price_USD and the Purchase_Probability, making it more difficult for the model to predict Purchase_Probability.

5.1.2 Better shift: removing noise

set.seed(candidate_number)

CEF_better <- function(Watch_Type, Case_Material, Feature_Count, Dimension_mm, Price_USD) {
  base <- 0.3
  
  material_effect <- ifelse(
    Case_Material == "Gold", 0.1,
    ifelse(Case_Material == "Titanium", 0.05,
           ifelse(Case_Material == "Stainless Steel", 0, -0.05))
  )
  
  Feature_Count <- as.numeric(Feature_Count)
  feature_effect <- 0.02 * Feature_Count
  
  # Dimension effect
  dimension_effect <- ifelse(
    Dimension_mm < 36, -0.05, 
    ifelse(Dimension_mm <= 42, 0.05, -0.02)  
  )

  price_effect <- ifelse(
    Watch_Type == "Luxury", 0.001 * log(Price_USD + 1),
    ifelse(Watch_Type == "Mid-Range", 0.0005 * log(Price_USD + 1),
           -0.0001 * Price_USD)
  )
  
  # Add noise for randomness and to reduce overfitting
  
  # Calculate probabilities
  prob <- base + material_effect + feature_effect + dimension_effect + price_effect
  
  # Clip probabilities to range [0, 1]
  prob <- pmin(pmax(prob, 0), 1)
  
  return(prob)
}

watch_con_better <- watch_data
watch_con_better$Purchase_Probability <- with(
  watch_con_better,
  CEF_better(Watch_Type, Case_Material, Feature_Count, Dimension_mm, Price_USD)
)
watch_con_better$Feature_Count <- as.factor(watch_con_better$Feature_Count)
set.seed(candidate_number)
# Predict the ID-trained model on better OOD data
con_predictions_better <- predict(final_boost_model_id, new_data = watch_con_better) %>%
  bind_cols(watch_con_better)

# Extract metrics
metrics_con_better <- metrics(
  data = con_predictions_better,
  truth = Purchase_Probability,
  estimate = .pred
)
print(metrics_con_better)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0248
## 2 rsq     standard      0.978 
## 3 mae     standard      0.0214

Removing the noise from the CEF made the concept shift model perform better than the ID, as shown by the metrics (lower rmse and mae values with a higher rsq value).

Since the model utilises boosted trees, where they train models sequentially, decreasing residuals by a small amount each time, this means they’re particularly prone to overfitting to noise.

By removing noise, the model focused purely on learning the deterministic relationships encoded in the CEF, leading to better alignment between predictions and the actual target values.

5.2 Covariate shift

5.2.1 Worse shift: Watch_Type imbalances

set.seed(candidate_number)
watch_data$Feature_Count <- as.numeric(as.character(watch_data$Feature_Count))

# Make luxury watches a lot more common (70%)
watch_cov_worse <- watch_data
watch_cov_worse$Watch_Type <- factor(
  sample(c("Luxury", "Mid-Range", "Budget"), 2000, replace = TRUE, prob = c(0.7, 0.2, 0.1))
)

# Recalculate Purchase_Probability using the original CEF
watch_cov_worse$Purchase_Probability <- with(
  watch_cov_worse,
  CEF(Watch_Type, Case_Material, Feature_Count, Dimension_mm, Price_USD)
)

watch_cov_worse$Feature_Count <- as.factor(watch_cov_worse$Feature_Count)
set.seed(candidate_number)
# Predict on worse OOD shift with ID model
predictions_cov_worse <- predict(final_boost_model_id, new_data = watch_cov_worse) %>%
  bind_cols(watch_cov_worse)

# Extract metrics
metrics_cov_worse <- metrics(
  data = predictions_cov_worse,
  truth = Purchase_Probability,
  estimate = .pred
)
print(metrics_cov_worse)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.122 
## 2 rsq     standard      0.204 
## 3 mae     standard      0.0699

In the original dataset, 40% of watches were Luxury; the covariate shift increased this to 70%, skewing the interaction between Watch_Type and Price_USD.

In the ID feature importance plot, Mid-Range watches contributed moderately, whilst Luxury watches contributed so little, reflecting a more meaningful interaction with Price_USD from Mid-Range watches. Thus, over-representing Luxury watches reduced diversity, under-representing Mid-Range watches and skewing the dataset, weakening the predictive power of the model.

The model overfit to luxury watch patterns, relying heavily on their Price_USD interactions. This imbalance degraded performance on a balanced test set due to poor generalization to underrepresented categories.

5.2.2 Better shift: narrower price range and higher feature counts

set.seed(candidate_number)

# Re-define Feature_Count as a numeric
watch_data$Feature_Count <- as.numeric(as.character(watch_data$Feature_Count))

watch_cov_better <- watch_data # Store in separate variable

# Narrower price ranges for each Watch_Type
watch_cov_better$Price_USD <- sapply(watch_cov_better$Watch_Type, function(type) {
  if (type == "Luxury") {
    round(runif(1, 15000, 30000), 2)
  } else if (type == "Mid-Range") {
    round(runif(1, 3000, 6000), 2) 
  } else {
    round(runif(1, 1000, 1500), 2) 
  }
})

# Increase Feature_Count toward higher values (4-6)
watch_cov_better$Feature_Count <- sample(4:6, nrow(watch_cov_better), replace = TRUE)


# Purchase_Probability using the original CEF
watch_cov_better$Purchase_Probability <- with(
  watch_cov_better,
  CEF(Watch_Type, Case_Material, Feature_Count, Dimension_mm, Price_USD)
)
# Convert Feature_Count back to a categorical
watch_cov_better$Feature_Count <- as.factor(watch_cov_better$Feature_Count)
set.seed(candidate_number)

# Predict on better OOD shift with ID model
predictions_cov_better <- predict(final_boost_model_id, new_data = watch_cov_better) %>%
  bind_cols(watch_cov_better)

# Extract metrics
metrics_cov_better <- metrics(
  data = predictions_cov_better,
  truth = Purchase_Probability,
  estimate = .pred
)
print(metrics_cov_better)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      0.0255
## 2 rsq     standard      0.931 
## 3 mae     standard      0.0201

A covariate shift that improved model performance involved narrowing price ranges within each Watch_Type and constraining Feature_Count to higher values (4–6).

In the original data, broad price ranges introduced variability which may have reduced signal strength. By narrowing these ranges, the model better distinguished purchase probabilities tied to Price_USD, ranked the most important predictor.

High feature counts had more importance than lower counts, thus shifting feature counts to higher values strengthened the relationship, aligning with CEF’s linear positive relationship with Feature_Count and Purchase_Probability

These adjustments reduced variability and aligned the data with the original CEF and boosted tree feature importance, resulting in better performance than the ID model.

5.3 Table Comparison

RMSE RSQ MAE
ID Generalisation (boosted tree) 0.0275 0.924 0.0207
Concept shift: worse 0.281 0.326 0.182
Concept shift: better 0.0248 0.978 0.0214
Covariate shift: worse 0.122 0.204 0.0699
Covariate shift: better 0.0255 0.931 0.0201